#!/bin/bash

# Convert a dump file into a dated directory of XML files.
#
# Usage: <this-script> DUMP[.bz2]
#
#   DUMP  Path to the dump. If the name ends in ".bz2" it is decompressed
#         first (pbzip2 is preferred, bzip2 is the fallback) and the
#         decompressed copy is removed again once the conversion succeeded.
#
# The dump is fed through the helper programs ./convert-to-text, ./remove-stub,
# ./strip-html, ./strip-markup, ./parse-talkpage and ./convert-to-xml, which
# must be present in the current working directory. The output directory
# "$xml_dir" is renamed to "$final_dir_prefix-YYYY-MM-DD" at the end.
#
# Exit status: 0 on success, 1 on any failure.

archive=$1
txt_dir='txt'
xml_dir='xml'
final_dir_prefix='wikipedia-talkpages-opencorpora'

# Let the conversion pipeline report a failure of ANY stage, not only the last.
set -o pipefail

if [[ -z "$archive" ]]
then
    echo "Dump archive not specified" >&2
    exit 1
fi

# Only a trailing ".bz2" marks a compressed archive; strip exactly that suffix
# (a ".bz2" elsewhere in the path must be left alone).
compressed=0
dump=$archive
if [[ "$archive" == *.bz2 ]]
then
    compressed=1
    dump=${archive%.bz2}

    echo "Decompressing dump..."

    # Prefer the parallel implementation when it is installed.
    decompressor=$(command -v pbzip2 || command -v bzip2)
    if [[ -z "$decompressor" ]]
    then
        echo "Neither pbzip2 nor bzip2 found in PATH" >&2
        exit 1
    fi

    # NOTE(review): without -k the decompressor deletes the .bz2 file on
    # success, and the decompressed dump is deleted during cleanup below, so
    # neither copy survives a successful run. Kept as-is; confirm whether
    # that is intended.
    if ! "$decompressor" -d "$archive"
    then
        echo "Decompression failed" >&2
        exit 1
    fi
fi

echo "Creating XML files..."
rm -fr "$xml_dir"
if ! mkdir "$xml_dir"
then
    echo "Cannot create directory $xml_dir" >&2
    exit 1
fi

# Stop before cleanup if the conversion failed, so that partial output is not
# published under the final name and the decompressed dump is not deleted.
if ! ./convert-to-text "$dump" \
    | ./remove-stub \
    | ./strip-html \
    | ./strip-markup \
    | ./parse-talkpage \
    | ./convert-to-xml "$xml_dir"
then
    echo "Conversion failed" >&2
    exit 1
fi

echo "Cleaning up..."
if ! mv "$xml_dir" "$final_dir_prefix-$(date +%F)"
then
    echo "Cannot rename $xml_dir" >&2
    exit 1
fi

# Presumably "$txt_dir" and /tmp/wiki_txt_list are scratch files left behind by
# the helper programs; nothing in this script creates them.
rm -fr "$txt_dir" /tmp/wiki_txt_list

# Use an explicit "if" rather than "[[ ... ]] && rm": as the last command, a
# false test would otherwise make a successful run exit with status 1.
if (( compressed ))
then
    rm -- "$dump"
fi
